Reading the Data

rawdata <- read_csv('data/nycschooldata.csv')
## Rows: 1272 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (36): Adjusted Grade, New?, Other Location Code in LCGMS, School Name, ...
## dbl (125): SED Code, District, Latitude, Longitude, Zip, Grade 3 ELA - All S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
school_df <- rawdata %>%
  clean_names() %>%
  select(4:41) %>%
  na_if('N/A') %>%
  mutate(
    #community_school = ifelse(community_school == 'Yes', 1, 0),
    economic_need_index = as.numeric(economic_need_index),
    school_income_estimate = as.numeric(
      gsub('[$,]', '', school_income_estimate)
      ),
    percent_ell = as.numeric(gsub('[%]', '', percent_ell)) / 100,
    percent_asian = as.numeric(gsub('[%]', '', percent_asian)) / 100,
    percent_black = as.numeric(gsub('[%]', '', percent_black)) / 100,
    percent_hispanic = as.numeric(gsub('[%]', '', percent_hispanic)) / 100,
    percent_blhi = as.numeric(gsub('[%]', '', percent_black_hispanic)) / 100,
    percent_white = as.numeric(gsub('[%]', '', percent_white)) / 100,
    attendance_rate = as.numeric(
      gsub('[%]', '', student_attendance_rate)
      ) / 100,
    chronically_absent = as.numeric(
      gsub('[%]', '', percent_of_students_chronically_absent)
      ) / 100,
    rigorous_instruction = as.numeric(
      gsub('[%]', '', rigorous_instruction_percent)
      ) / 100,
    collab_teachers = as.numeric(
      gsub('[%]', '', collaborative_teachers_percent)
    ) / 100,
    supportive_env = as.numeric(
      gsub('[%]', '', supportive_environment_percent)
    ) / 100,
    leadership = as.numeric(
      gsub('[%]', '', effective_school_leadership_percent)
    ) / 100,
    fam_com_ties = as.numeric(
      gsub('[%]', '', strong_family_community_ties_percent)
    ) / 100,
    trust = as.numeric(gsub('[%]', '', trust_percent)) / 100,
    ela_proficiency = as.numeric(average_ela_proficiency),
    math_proficiency = as.numeric(average_math_proficiency),
    avg_proficiency = (ela_proficiency + math_proficiency) / 2
  ) %>%
  select(
    -sed_code, -percent_black_hispanic, -student_attendance_rate,
    -percent_of_students_chronically_absent, -collaborative_teachers_percent,
    -rigorous_instruction_percent, -supportive_environment_percent,
    -effective_school_leadership_percent, -strong_family_community_ties_percent,
    -trust_percent, -average_ela_proficiency, -average_math_proficiency, 
    -grades, -grade_low, -grade_high, -ends_with('rating')
  )

loc_df <- school_df %>% 
  select(school_name, location_code, district, latitude, 
         longitude, address_full, city, zip)

cor(select_if(school_df, is.numeric), use="pairwise.complete.obs") %>%
  corrplot(method = 'shade', type = 'lower', diag = FALSE)

eda_p1 <- ggplot(school_df, 
                 aes(x = ela_proficiency, y = math_proficiency,
                     color = school_income_estimate)) +
  geom_point()

ggplot(school_df, aes(x = school_income_estimate)) + 
  geom_histogram(bins=35)
## Warning: Removed 396 rows containing non-finite values (`stat_bin()`).

unique(school_df$city)
##  [1] "NEW YORK"            "ROOSEVELT ISLAND"    "BRONX"              
##  [4] "BROOKLYN"            "ELMHURST"            "WOODSIDE"           
##  [7] "CORONA"              "MIDDLE VILLAGE"      "MASPETH"            
## [10] "RIDGEWOOD"           "GLENDALE"            "LONG ISLAND CITY"   
## [13] "FLUSHING"            "COLLEGE POINT"       "WHITESTONE"         
## [16] "BAYSIDE"             "QUEENS VILLAGE"      "LITTLE NECK"        
## [19] "DOUGLASTON"          "FLORAL PARK"         "BELLEROSE"          
## [22] "JAMAICA"             "ARVERNE"             "FAR ROCKAWAY"       
## [25] "SOUTH OZONE PARK"    "BROAD CHANNEL"       "RICHMOND HILL"      
## [28] "WOODHAVEN"           "SOUTH RICHMOND HILL" "OZONE PARK"         
## [31] "ROCKAWAY PARK"       "HOWARD BEACH"        "ROCKAWAY BEACH"     
## [34] "KEW GARDENS"         "FOREST HILLS"        "REGO PARK"          
## [37] "SPRINGFIELD GARDENS" "HOLLIS"              "SAINT ALBANS"       
## [40] "ROSEDALE"            "CAMBRIA HEIGHTS"     "JACKSON HEIGHTS"    
## [43] "ASTORIA"             "EAST ELMHURST"       "STATEN ISLAND"
r <- GET('http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson')

nycmap <- rgdal::readOGR(content(r, 'text'), 'OGRGeoJSON', verbose=F)
## Warning: OGR support is provided by the sf and terra packages among others
## No encoding supplied: defaulting to UTF-8.
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
nycmaps_df <- tidy(nycmap)
## Regions defined for each Polygons
ggplot() +
  geom_polygon(data=nycmaps_df, aes(x=long, y=lat, group=group),
               color='white', fill='lightgrey') +
  geom_point(data=school_df, 
             mapping=aes(x = longitude, y = latitude, color=city)) +
  labs(title='Schools in the New York City Area by City') +
  theme(panel.background = element_blank(), legend.position = 'none',
        axis.title.x = element_blank(), axis.title.y = element_blank(), 
        axis.text.x = element_blank(), axis.text.y = element_blank(),
        axis.ticks.x = element_blank(), axis.ticks.y = element_blank())

school_df
## # A tibble: 1,272 × 28
##    school_…¹ locat…² distr…³ latit…⁴ longi…⁵ addre…⁶ city    zip commu…⁷ econo…⁸
##    <chr>     <chr>     <dbl>   <dbl>   <dbl> <chr>   <chr> <dbl> <chr>     <dbl>
##  1 P.S. 015… 01M015        1    40.7   -74.0 333 E … NEW … 10009 Yes       0.919
##  2 P.S. 019… 01M019        1    40.7   -74.0 185 1S… NEW … 10003 No        0.641
##  3 P.S. 020… 01M020        1    40.7   -74.0 166 ES… NEW … 10002 No        0.744
##  4 P.S. 034… 01M034        1    40.7   -74.0 730 E … NEW … 10009 No        0.86 
##  5 THE STAR… 01M063        1    40.7   -74.0 121 E … NEW … 10009 No        0.73 
##  6 P.S. 064… 01M064        1    40.7   -74.0 600 E … NEW … 10009 No        0.858
##  7 P.S. 110… 01M110        1    40.7   -74.0 285 DE… NEW … 10002 No        0.499
##  8 P.S. 134… 01M134        1    40.7   -74.0 293 E … NEW … 10002 No        0.833
##  9 P.S. 140… 01M140        1    40.7   -74.0 123 RI… NEW … 10002 No        0.849
## 10 P.S. 142… 01M142        1    40.7   -74.0 100 AT… NEW … 10002 No        0.861
## # … with 1,262 more rows, 18 more variables: school_income_estimate <dbl>,
## #   percent_ell <dbl>, percent_asian <dbl>, percent_black <dbl>,
## #   percent_hispanic <dbl>, percent_white <dbl>, percent_blhi <dbl>,
## #   attendance_rate <dbl>, chronically_absent <dbl>,
## #   rigorous_instruction <dbl>, collab_teachers <dbl>, supportive_env <dbl>,
## #   leadership <dbl>, fam_com_ties <dbl>, trust <dbl>, ela_proficiency <dbl>,
## #   math_proficiency <dbl>, avg_proficiency <dbl>, and abbreviated variable …
ggplot(school_df, aes(community_school, avg_proficiency)) + 
  geom_boxplot()
## Warning: Removed 55 rows containing non-finite values (`stat_boxplot()`).

# MAJOR INDICATOR

unique(school_df$city)
##  [1] "NEW YORK"            "ROOSEVELT ISLAND"    "BRONX"              
##  [4] "BROOKLYN"            "ELMHURST"            "WOODSIDE"           
##  [7] "CORONA"              "MIDDLE VILLAGE"      "MASPETH"            
## [10] "RIDGEWOOD"           "GLENDALE"            "LONG ISLAND CITY"   
## [13] "FLUSHING"            "COLLEGE POINT"       "WHITESTONE"         
## [16] "BAYSIDE"             "QUEENS VILLAGE"      "LITTLE NECK"        
## [19] "DOUGLASTON"          "FLORAL PARK"         "BELLEROSE"          
## [22] "JAMAICA"             "ARVERNE"             "FAR ROCKAWAY"       
## [25] "SOUTH OZONE PARK"    "BROAD CHANNEL"       "RICHMOND HILL"      
## [28] "WOODHAVEN"           "SOUTH RICHMOND HILL" "OZONE PARK"         
## [31] "ROCKAWAY PARK"       "HOWARD BEACH"        "ROCKAWAY BEACH"     
## [34] "KEW GARDENS"         "FOREST HILLS"        "REGO PARK"          
## [37] "SPRINGFIELD GARDENS" "HOLLIS"              "SAINT ALBANS"       
## [40] "ROSEDALE"            "CAMBRIA HEIGHTS"     "JACKSON HEIGHTS"    
## [43] "ASTORIA"             "EAST ELMHURST"       "STATEN ISLAND"
ggplot(school_df, 
       aes(x = avg_proficiency, y = economic_need_index, 
       color = school_income_estimate)
       ) +
  geom_point() + scale_y_continuous()
## Warning: Removed 55 rows containing missing values (`geom_point()`).

count(school_df, city) # ASSUMING MANHATTAN = NEW YORK
## # A tibble: 45 × 2
##    city                n
##    <chr>           <int>
##  1 ARVERNE             2
##  2 ASTORIA             6
##  3 BAYSIDE            13
##  4 BELLEROSE           4
##  5 BROAD CHANNEL       1
##  6 BRONX             297
##  7 BROOKLYN          411
##  8 CAMBRIA HEIGHTS     2
##  9 COLLEGE POINT       2
## 10 CORONA              9
## # … with 35 more rows
ggplot(filter(school_df, city==c('NEW YORK', 'BRONX')), aes(avg_proficiency, latitude)) + geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, city==c('NEW YORK', 'BRONX')), aes(avg_proficiency, longitude)) + geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).

While it may seem unnecessary, this was graph was created to determine if there was any correlation between a school’s location.

ggplot(school_df, aes(x=percent_ell, y=avg_proficiency)) + 
  geom_point() + scale_x_sqrt()
## Warning: Removed 55 rows containing missing values (`geom_point()`).

ggplot(school_df, aes(x = percent_white, y = avg_proficiency)) + 
  geom_point() + scale_x_sqrt()
## Warning: Removed 55 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, attendance_rate > 0.7), aes(x = attendance_rate, y = avg_proficiency)) +
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

# OUTLIERS REMOVED TO FOCUS ON THIS DISTRIBUTION
ggplot(filter(school_df, rigorous_instruction > 0), 
       aes(x = rigorous_instruction, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

# FILTERING OUT OUTLIERS AT 0
ggplot(filter(school_df, fam_com_ties > 0), 
       aes(x = fam_com_ties, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, collab_teachers > 0),
              aes(x = collab_teachers, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

ggplot(school_df, aes(x = chronically_absent, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 55 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, leadership > 0), aes(x = leadership, y = avg_proficiency)) + geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, trust>0), aes(x = trust, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, supportive_env > 0), 
       aes(x = supportive_env, y = avg_proficiency)) +
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

ggplot(filter(school_df, leadership > 0), 
       aes(x = leadership, y = avg_proficiency)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

school_final <- school_df %>%
  select(-school_name, -location_code, -district, -latitude, -longitude, 
         -address_full, -city, -zip) %>%
  filter(is.na(avg_proficiency) == FALSE)

colMeans(is.na(school_final))
##       community_school    economic_need_index school_income_estimate 
##              0.0000000              0.0000000              0.3196385 
##            percent_ell          percent_asian          percent_black 
##              0.0000000              0.0000000              0.0000000 
##       percent_hispanic          percent_white           percent_blhi 
##              0.0000000              0.0000000              0.0000000 
##        attendance_rate     chronically_absent   rigorous_instruction 
##              0.0000000              0.0000000              0.0000000 
##        collab_teachers         supportive_env             leadership 
##              0.0000000              0.0000000              0.0000000 
##           fam_com_ties                  trust        ela_proficiency 
##              0.0000000              0.0000000              0.0000000 
##       math_proficiency        avg_proficiency 
##              0.0000000              0.0000000

##Linear Regression

Splitting into Training and Testing Sets

set.seed(101010)

school_split <- initial_split(school_final, prop=0.8, strata=avg_proficiency)
school_train <- training(school_split)
school_test <- testing(school_split)

Creating Recipe #1 (Community School as a Dummy Variable)

school_recipe <- recipe(avg_proficiency ~ ., data=school_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_rm(c('math_proficiency', 'ela_proficiency', 'percent_asian', 
            'percent_black', 'percent_hispanic', 'percent_white')) %>%
  step_impute_linear(school_income_estimate, 
                     impute_with=imp_vars(
                       economic_need_index, starts_with('community')
                       )) %>%
  step_interact(~ starts_with('community'):school_income_estimate +
                  attendance_rate:chronically_absent) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors())


lm_model <- linear_reg() %>%
  set_engine('lm')

lm_wkflow <- workflow() %>%
  add_model(lm_model) %>%
  add_recipe(school_recipe)

lm_fit <- fit(lm_wkflow, school_train)

school_recipe
## Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         19
## 
## Operations:
## 
## Dummy variables from all_nominal_predictors()
## Variables removed c("math_proficiency", "ela_proficiency", "percent_asian...
## Linear regression imputation for school_income_estimate
## Interactions with starts_with("community"):school_income_estimate + ...
## Centering for all_numeric_predictors()
## Scaling for all_numeric_predictors()

Metrics

school_train_res <- predict(lm_fit, new_data = school_train %>% 
                              select(-avg_proficiency))
school_test_res <- predict(lm_fit, new_data = school_test %>%
                             select(-avg_proficiency))
school_train_res <- bind_cols(school_train_res, school_train %>% 
                                select(avg_proficiency))

school_train_res %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
  geom_point(alpha = 0.2) +
  geom_abline(lty = 2) +
  theme_minimal() + coord_obs_pred()

school_test_res <- bind_cols(school_test_res, school_test %>%
                               select(avg_proficiency))

school_test_res %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
  geom_point(alpha = 0.2) +
  geom_abline(lty = 2) +
  theme_minimal() + coord_obs_pred()

Metrics

school_metrics <- metric_set(rmse, rsq, mae)
school_test_metrics <- metric_set(rmse, rsq, mae)

Testing Metrics

model_1_metrics <- bind_rows(
  school_test_metrics(school_train_res, truth=avg_proficiency, estimate=.pred),
  school_metrics(school_test_res, truth=avg_proficiency, estimate=.pred)
)
model_1_metrics
## # A tibble: 6 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.176
## 2 rsq     standard       0.824
## 3 mae     standard       0.130
## 4 rmse    standard       0.181
## 5 rsq     standard       0.776
## 6 mae     standard       0.136
school_final
## # A tibble: 1,217 × 20
##    community_s…¹ econo…² schoo…³ perce…⁴ perce…⁵ perce…⁶ perce…⁷ perce…⁸ perce…⁹
##    <chr>           <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Yes             0.919  31142.    0.09    0.05    0.32    0.6     0.01    0.92
##  2 No              0.641  56463.    0.05    0.1     0.2     0.63    0.06    0.83
##  3 No              0.744  44343.    0.15    0.35    0.08    0.49    0.04    0.57
##  4 No              0.86   31454     0.07    0.05    0.29    0.63    0.04    0.92
##  5 No              0.73   46436.    0.03    0.04    0.2     0.65    0.1     0.84
##  6 No              0.858  39415.    0.06    0.07    0.19    0.66    0.07    0.84
##  7 No              0.499  43707.    0.01    0.16    0.1     0.43    0.27    0.53
##  8 No              0.833  28821.    0.12    0.21    0.2     0.55    0.03    0.75
##  9 No              0.849  34889.    0.14    0.05    0.13    0.78    0.03    0.9 
## 10 No              0.861  35545.    0.08    0.06    0.11    0.78    0.02    0.9 
## # … with 1,207 more rows, 11 more variables: attendance_rate <dbl>,
## #   chronically_absent <dbl>, rigorous_instruction <dbl>,
## #   collab_teachers <dbl>, supportive_env <dbl>, leadership <dbl>,
## #   fam_com_ties <dbl>, trust <dbl>, ela_proficiency <dbl>,
## #   math_proficiency <dbl>, avg_proficiency <dbl>, and abbreviated variable
## #   names ¹​community_school, ²​economic_need_index, ³​school_income_estimate,
## #   ⁴​percent_ell, ⁵​percent_asian, ⁶​percent_black, ⁷​percent_hispanic, …
filter(school_df, attendance_rate > 0) %>%
  ggplot(aes(x = attendance_rate, y = avg_proficiency, 
             color = chronically_absent)) + 
  geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).

school_train %>%
  ggplot(
    aes(x = school_income_estimate, y = avg_proficiency, 
        shape = community_school)
    ) + geom_point(alpha=0.4)
## Warning: Removed 318 rows containing missing values (`geom_point()`).

Ridge/Lasso Modelling

Splitting the Data and Creating Folds

school_split <- initial_split(school_final, strata = avg_proficiency)
school_train <- training(school_split)
school_test <- testing(school_split)

school_folds <- vfold_cv(school_train, strata=avg_proficiency)

Recipe Creation (Same as Above, but using step_normalize)

school_recipe <- recipe(avg_proficiency ~ ., data=school_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_rm(c('math_proficiency', 'ela_proficiency', 'percent_asian', 
            'percent_black', 'percent_hispanic', 'percent_white')) %>%
  step_impute_linear(school_income_estimate, 
                     impute_with=imp_vars(
                       economic_need_index, starts_with('community')
                       )) %>%
  step_interact(~ starts_with('community'):school_income_estimate +
                  attendance_rate:chronically_absent) %>%
  step_normalize(all_numeric_predictors())

Tuning penalty and mixture.

school_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_mode('regression') %>%
  set_engine('glmnet')

Creating the Workflow and Grid for Tuning

school_wkflow <- workflow() %>%
  add_recipe(school_recipe) %>%
  add_model(school_spec)

pen_mix_grid <- grid_regular(
  penalty(range = c(-5, 5)), mixture(range = c(0, 1)), levels = 10
)

Fitting the Model

tune_res <- tune_grid(
  school_wkflow,
  resamples = school_folds,
  grid = pen_mix_grid
)
## ! Fold01: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold02: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold03: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold04: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold05: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold06: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold07: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold08: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold09: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold10: internal: A correlation computation is required, but `estimate` is constant and ha...

Autoplot

autoplot(tune_res)

Fitting the Model to the Training Set

best_model <- select_best(tune_res, metric='rsq')

school_final_fit <- finalize_workflow(school_wkflow, best_model) %>%
  fit(data = school_train)

train_results <- augment(school_final_fit, new_data = school_train)
train_results %>%
  school_metrics(avg_proficiency, estimate=.pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.180
## 2 rsq     standard       0.813
## 3 mae     standard       0.133

Fitting the Model to the Testing Set

test_results <- augment(school_final_fit, new_data = school_test)
test_results %>%
  school_metrics(avg_proficiency, estimate=.pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.168
## 2 rsq     standard       0.813
## 3 mae     standard       0.123

This model has a better \(R^2\) value than the prior, and thus will now be used further.

model_1_metrics
## # A tibble: 6 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.176
## 2 rsq     standard       0.824
## 3 mae     standard       0.130
## 4 rmse    standard       0.181
## 5 rsq     standard       0.776
## 6 mae     standard       0.136

Predicting the Values so they can be Plotted

school_train_res <- predict(school_final_fit, new_data = school_train %>% 
                              select(-avg_proficiency))
school_test_res <- predict(school_final_fit, new_data = school_test %>%
                             select(-avg_proficiency))
train_plot_results <- bind_cols(school_train_res, school_train %>% 
                                select(avg_proficiency))

train_plot_results %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
  geom_point(alpha = 0.2) +
  geom_abline(lty = 2) +
  theme_minimal() + coord_obs_pred() + labs(title='Training Set')

test_plot_results <- bind_cols(school_test_res, school_test %>%
                               select(avg_proficiency))

test_plot_results %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
  geom_point(alpha = 0.2) +
  geom_abline(lty = 2) +
  theme_minimal() + coord_obs_pred() + labs(title='Testing Set')